home *** CD-ROM | disk | FTP | other *** search
/ Clickx 47 / Clickx 47.iso / assets / software / Miro_Installer.exe / Miro_Downloader.exe / rdfa.pyc (.txt) < prev    next >
Encoding:
Python Compiled Bytecode  |  2008-01-10  |  7.8 KB  |  297 lines

  1. # Source Generated with Decompyle++
  2. # File: in.pyc (Python 2.5)
  3.  
  4. '''
  5. RDFa parser. 
  6.  
  7. RDFa is a set of attributes used to embed RDF in XHTML. An important goal of
  8. RDFa is to achieve this RDF embedding without repeating existing XHTML content
  9. when that content is the metadata.  
  10.  
  11. REFERENCES:
  12.  
  13. \thttp://www.w3.org/2001/sw/BestPractices/HTML/2005-rdfa-syntax
  14.  
  15. Copyright (c) 2006, Elias Torres <elias@torrez.us>
  16. Licensed to the public under the GNU GPL v2.
  17.  
  18. '''
  19. import sys
  20. import re
  21. import urllib
  22. import urlparse
  23. import cStringIO
  24. from xml.dom import pulldom
  25. __version__ = '$Id: rdfa.py 118 2006-06-03 18:35:18Z eliast $'
  26. rdfa_attribs = [
  27.     'about',
  28.     'property',
  29.     'rel',
  30.     'rev',
  31.     'href',
  32.     'content']
  33.  
  34. class NS(unicode):
  35.     
  36.     def __getattr__(self, name):
  37.         return self + name
  38.  
  39.  
  40. xhtml = NS('http://www.w3.org/1999/xhtml')
  41. xml = NS('http://www.w3.org/XML/1998/namespace')
  42. rdf = NS('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
  43.  
  44. class Node(unicode):
  45.     pass
  46.  
  47.  
  48. class URI(Node):
  49.     pass
  50.  
  51.  
  52. class bNode(Node):
  53.     pass
  54.  
  55.  
  56. class Literal(Node):
  57.     
  58.     def __new__(cls, lit, lang = None, dtype = None):
  59.         n = '"' + lit + '"'
  60.         if lang is not None:
  61.             n += '@' + str(lang)
  62.         elif dtype is not None:
  63.             n += '^^<' + str(dtype) + '>'
  64.         
  65.         return unicode.__new__(cls, n)
  66.  
  67.  
  68. _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
  69.  
  70. def _urljoin(base, uri):
  71.     uri = _urifixer.sub('\\1\\3', uri)
  72.     return urlparse.urljoin(base, uri)
  73.  
  74.  
  75. class RDFaParser:
  76.     
  77.     def __init__(self, sink, base = None, lang = None):
  78.         self.triple = sink.triple
  79.         if not base:
  80.             pass
  81.         self.baseuri = ''
  82.         if not lang:
  83.             pass
  84.         self.lang = None
  85.         self.abouts = []
  86.         self.xmlbases = []
  87.         self.langs = []
  88.         self.elementStack = [
  89.             None]
  90.         self.bcounter = { }
  91.         self.bnodes = { }
  92.  
  93.     
  94.     def generateBlankNode(self, parentNode):
  95.         name = parentNode.tagName
  96.         if self.bnodes.has_key(parentNode):
  97.             return self.bnodes[parentNode]
  98.         
  99.         if self.bcounter.has_key(name):
  100.             self.bcounter[name] = self.bcounter[name] + 1
  101.         else:
  102.             self.bcounter[name] = 0
  103.         self.bnodes[parentNode] = bNode('_:%s%d' % (name, self.bcounter[name]))
  104.         return self.bnodes[parentNode]
  105.  
  106.     
  107.     def extractCURIEorURI(self, resource):
  108.         if len(resource) > 0 and resource[0] == '[' and resource[-1] == ']':
  109.             resource = resource[1:-1]
  110.         
  111.         if resource.find(':') > -1:
  112.             (rpre, rsuf) = resource.split(':', 1)
  113.             for nsc in self.handler._ns_contexts:
  114.                 for ns, prefix in nsc.items():
  115.                     if prefix == rpre:
  116.                         resource = ns + rsuf
  117.                         continue
  118.                 
  119.             
  120.         
  121.         if len(resource) > 0 and resource[0:2] == '_:':
  122.             return bNode(resource)
  123.         
  124.         return URI(self.resolveURI(resource))
  125.  
  126.     
  127.     def resolveURI(self, uri):
  128.         if not self.baseuri:
  129.             pass
  130.         return _urljoin('', uri)
  131.  
  132.     
  133.     def _popStacks(self, event, node):
  134.         if len(self.abouts) != 0:
  135.             (about, aboutnode) = self.abouts[-1]
  136.             if aboutnode == node:
  137.                 self.abouts.pop()
  138.             
  139.         
  140.         self.elementStack.pop()
  141.         if self.xmlbases:
  142.             self.xmlbases.pop()
  143.             if self.xmlbases and self.xmlbases[-1]:
  144.                 self.baseuri = self.xmlbases[-1]
  145.             
  146.         
  147.         if self.langs:
  148.             self.langs.pop()
  149.             if self.langs and self.langs[-1]:
  150.                 self.lang = self.langs[-1]
  151.             
  152.         
  153.  
  154.     
  155.     def parse(self, stream):
  156.         events = pulldom.parse(stream)
  157.         self.handler = events.pulldom
  158.         for None in events:
  159.             (event, node) = None
  160.             if event == pulldom.START_DOCUMENT:
  161.                 self.abouts += [
  162.                     (URI(''), node)]
  163.             
  164.             if event == pulldom.END_DOCUMENT:
  165.                 if not len(self.elementStack) == 0:
  166.                     raise AssertionError
  167.             
  168.             if event == pulldom.START_ELEMENT:
  169.                 self.elementStack += [
  170.                     node]
  171.                 found = (filter,)((lambda x: x in node.attributes.keys()), rdfa_attribs)
  172.                 if not node.getAttributeNS(xml, 'base') and node.getAttribute('base'):
  173.                     pass
  174.                 baseuri = self.baseuri
  175.                 self.baseuri = _urljoin(self.baseuri, baseuri)
  176.                 self.xmlbases.append(self.baseuri)
  177.                 if node.hasAttributeNS(xml, 'lang') or node.hasAttribute('lang'):
  178.                     if not node.getAttributeNS(xml, 'lang'):
  179.                         pass
  180.                     lang = node.getAttribute('lang')
  181.                     if lang == '':
  182.                         lang = None
  183.                     
  184.                 else:
  185.                     lang = self.lang
  186.                 self.lang = lang
  187.                 self.langs.append(lang)
  188.                 if len(found) == 0:
  189.                     continue
  190.                 
  191.                 parentNode = self.elementStack[-2]
  192.                 if 'about' in found:
  193.                     self.abouts += [
  194.                         (self.extractCURIEorURI(node.getAttribute('about')), node)]
  195.                 
  196.                 subject = self.abouts[-1][0]
  197.                 if node.tagName == 'meta' or node.tagName == 'link':
  198.                     if 'about' not in found and parentNode:
  199.                         if parentNode.hasAttribute('about'):
  200.                             subject = self.extractCURIEorURI(parentNode.getAttribute('about'))
  201.                         elif parentNode.hasAttributeNS(xml, 'id') or parentNode.hasAttribute('id'):
  202.                             if not parentNode.getAttributeNS(xml, 'id'):
  203.                                 pass
  204.                             id = parentNode.getAttribute('id')
  205.                             subject = self.extractCURIEorURI('#' + id)
  206.                         else:
  207.                             subject = self.generateBlankNode(parentNode)
  208.                     
  209.                 
  210.                 if 'property' in found:
  211.                     predicate = self.extractCURIEorURI(node.getAttribute('property'))
  212.                     literal = None
  213.                     datatype = None
  214.                     if node.hasAttribute('datatype'):
  215.                         datatype = self.extractCURIEorURI(node.getAttribute('datatype'))
  216.                         if datatype == 'plaintext':
  217.                             datatype = None
  218.                         
  219.                     
  220.                     if node.hasAttribute('content'):
  221.                         literal = Literal(node.getAttribute('content'), lang = lang, dtype = datatype)
  222.                     else:
  223.                         events.expandNode(node)
  224.                         self._popStacks(event, node)
  225.                         content = ''
  226.                         for child in node.childNodes:
  227.                             content += child.toxml()
  228.                         
  229.                         content = content.strip()
  230.                         literal = Literal(content, dtype = rdf.XMLLiteral)
  231.                     if literal:
  232.                         self.triple(subject, predicate, literal)
  233.                     
  234.                 
  235.                 if 'rel' in found:
  236.                     predicate = self.extractCURIEorURI(node.getAttribute('rel'))
  237.                     if node.hasAttribute('href'):
  238.                         object = self.extractCURIEorURI(node.getAttribute('href'))
  239.                         self.triple(subject, predicate, object)
  240.                     
  241.                 
  242.                 if 'rev' in found:
  243.                     predicate = self.extractCURIEorURI(node.getAttribute('rev'))
  244.                     if node.hasAttribute('href'):
  245.                         object = self.extractCURIEorURI(node.getAttribute('href'))
  246.                         self.triple(object, predicate, subject)
  247.                     
  248.                 
  249.             
  250.             if event == pulldom.END_ELEMENT:
  251.                 self._popStacks(event, node)
  252.                 continue
  253.         
  254.  
  255.  
  256.  
  257. class Sink(object):
  258.     
  259.     def __init__(self):
  260.         self.result = ''
  261.  
  262.     
  263.     def __str__(self):
  264.         return self.result
  265.  
  266.     
  267.     def triple(self, s, p, o):
  268.         if o.__class__ is URI:
  269.             o = '<' + o + '>'
  270.         
  271.         if s.__class__ is URI:
  272.             s = '<' + s + '>'
  273.         
  274.         self.result += '%s <%s> %s .\n' % (s, p, o)
  275.  
  276.  
  277.  
  278. def parseRDFa(s, base = None, sink = None):
  279.     print 'sink is ', sink
  280.     if not sink:
  281.         pass
  282.     sink = Sink()
  283.     parser = RDFaParser(sink, base)
  284.     parser.parse(cStringIO.StringIO(s))
  285.     return sink
  286.  
  287.  
  288. def parseURI(uri, sink = None):
  289.     return parseRDFa(urllib.urlopen(uri).read(), base = uri, sink = sink)
  290.  
  291. if __name__ == '__main__':
  292.     if len(sys.argv) != 2:
  293.         print __doc__
  294.     else:
  295.         print parseURI(sys.argv[1])
  296.  
  297.